Computer Shopper 242

home *** CD-ROM | disk | FTP | other *** search

/ Computer Shopper 242 / Issue 242 - April 2008 - DPCS0408DVD.ISO / Software Money Savers / VirtualDub / Source / VirtualDub-1.7.7-src.7z / src / Priss / source / a64_polyphase.asm next >

Wrap

Assembly Source File | 2006-03-14 | 14.2 KB | 421 lines

;       Priss (NekoAmp 2.0) - MPEG-1/2 audio decoding library
;       Copyright (C) 2003-2004 Avery Lee
;
;       This program is free software; you can redistribute it and/or modify
;       it under the terms of the GNU General Public License as published by
;       the Free Software Foundation; either version 2 of the License, or
;       (at your option) any later version.
;
;       This program is distributed in the hope that it will be useful,
;       but WITHOUT ANY WARRANTY; without even the implied warranty of
;       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;       GNU General Public License for more details.
;
;       You should have received a copy of the GNU General Public License
;       along with this program; if not, write to the Free Software
;       Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

;---------------------------------------------------------------------------
; Syntax : MASM (ml64), Intel operand order.
; ABI    : Microsoft x64.  Integer arguments arrive in rcx, rdx, r8, r9 and
;          then on the stack above the 32-byte home area.  xmm6-xmm15 are
;          non-volatile and must be preserved in full (all 128 bits).
;---------------------------------------------------------------------------

                .const

                align   16

c2              real4   0.92387953251128674,0.92387953251128674,0.92387953251128674,0.92387953251128674
c4              real4   0.70710678118654757,0.70710678118654757,0.70710678118654757,0.70710678118654757
c6              real4   0.38268343236508984,0.38268343236508984,0.38268343236508984,0.38268343236508984

; [1 .5 .5 -.5 .5 .5 .5 .5] ./ cos(pi*[0 5 6 1 4 7 2 3]/16)
d               real4   +1.00000000000000,+1.00000000000000,+1.00000000000000,+1.00000000000000
                real4   +0.89997622313642,+0.89997622313642,+0.89997622313642,+0.89997622313642
                real4   +1.30656296487638,+1.30656296487638,+1.30656296487638,+1.30656296487638
                real4   -0.50979557910416,-0.50979557910416,-0.50979557910416,-0.50979557910416
                real4   +0.70710678118655,+0.70710678118655,+0.70710678118655,+0.70710678118655
                real4   +2.56291544774151,+2.56291544774151,+2.56291544774151,+2.56291544774151
                real4   +0.54119610014620,+0.54119610014620,+0.54119610014620,+0.54119610014620
                real4   +0.60134488693505,+0.60134488693505,+0.60134488693505,+0.60134488693505

; Sign-bit masks for xorps: invother flips lanes 1 and 3, invall flips all four.
invother        dd      0, 80000000h, 0, 80000000h
invall          dd      80000000h, 80000000h, 80000000h, 80000000h

                extern  leecoef1 : far
                extern  leecoef2 : far

; Stack frame shared by the procedures that use xmm6/xmm7.  On entry
; rsp mod 16 == 8, so subtracting 40 leaves rsp 16-byte aligned:
;   [rsp+ 0]   saved xmm6
;   [rsp+16]   saved xmm7
;   [rsp+32]   8 bytes of alignment padding
XMMSAVE_FRAME_SIZE      equ     40
XMMSAVE_XMM6            equ     0
XMMSAVE_XMM7            equ     16

; Stack arguments of vdasm_mpegaudio_polyphase_matrixout_stereo, addressed
; from rsp after the frame has been allocated.  Before the allocation they
; live at [rsp+40], [rsp+48], [rsp+56] and [rsp+64] (return address plus
; 32-byte home area below them).
MATRIXOUT_PSAMPLEINV    equ     XMMSAVE_FRAME_SIZE + 40
MATRIXOUT_PDST          equ     XMMSAVE_FRAME_SIZE + 48
MATRIXOUT_PSRCFINAL     equ     XMMSAVE_FRAME_SIZE + 56
MATRIXOUT_PFINALMASK    equ     XMMSAVE_FRAME_SIZE + 64

                .code

;---------------------------------------------------------------------------
; vdasm_mpegaudio_polyphase_dctinputbutterflies
;
; Runs two decomposition butterfly stages over 32 input floats and writes 32
; interleaved output floats.
;
; In:    rcx = destination (128 bytes written; 8-byte stores, no 16-byte
;              alignment required)
;        rdx = source (128 bytes read with unaligned loads)
; Uses:  leecoef1, leecoef2 (external tables; used as mulps memory operands,
;        so they must be 16-byte aligned)
; Clobb: rcx, r8, r9, r10, r11, xmm0-xmm5, flags
;
; The loop runs twice (r9 = 0, 16 / r8 = 48, 32) and handles four values from
; each quarter of the input per iteration.
;---------------------------------------------------------------------------
vdasm_mpegaudio_polyphase_dctinputbutterflies proc public
                xor     r9, r9                          ;r9 = ascending byte offset
                mov     r8, 48                          ;r8 = descending byte offset
                lea     r10, leecoef1
                lea     r11, leecoef2

butterfly_loop:
                movups  xmm0, [rdx+r9]                  ;xmm0 = in[i]
                movups  xmm1, [rdx+r8]                  ;xmm1 = in[15-i]
                movups  xmm2, [rdx+r9+64]               ;xmm2 = in[i+16]
                movups  xmm3, [rdx+r8+64]               ;xmm3 = in[31-i]
                shufps  xmm1, xmm1, 00011011b           ;reverse lane order
                shufps  xmm3, xmm3, 00011011b           ;reverse lane order

                ;butterfly for first decomposition
                movaps  xmm4, xmm0
                movaps  xmm5, xmm1
                addps   xmm0, xmm3                      ;xmm0 = y0 = x0+x3
                addps   xmm1, xmm2                      ;xmm1 = y1 = x1+x2
                subps   xmm4, xmm3                      ;xmm4 = y2 = x0-x3
                subps   xmm5, xmm2                      ;xmm5 = y3 = x1-x2
                mulps   xmm4, [r10+r9]
                mulps   xmm5, [r10+r9+32]

                ;butterfly for second decomposition
                movaps  xmm2, xmm0
                movaps  xmm3, xmm4
                addps   xmm0, xmm1                      ;xmm0 = z0 = y0+y1
                subps   xmm2, xmm1                      ;xmm2 = z1 = y0-y1
                addps   xmm3, xmm5                      ;xmm3 = z2 = y2+y3
                subps   xmm4, xmm5                      ;xmm4 = z3 = y2-y3
                mulps   xmm2, [r11+r9]
                mulps   xmm4, [r11+r9]

                ;interleave in 0-2-1-3 order
                movaps  xmm1, xmm0
                unpcklps xmm0, xmm3                     ;xmm0 = z2B | z0B | z2A | z0A
                unpckhps xmm1, xmm3                     ;xmm1 = z2D | z0D | z2C | z0C
                movaps  xmm3, xmm2
                unpcklps xmm2, xmm4                     ;xmm2 = z3B | z1B | z3A | z1A
                unpckhps xmm3, xmm4                     ;xmm3 = z3D | z1D | z3C | z1C

                movlps  qword ptr [rcx   ], xmm0
                movlps  qword ptr [rcx+ 8], xmm2
                movhps  qword ptr [rcx+16], xmm0
                movhps  qword ptr [rcx+24], xmm2
                movlps  qword ptr [rcx+32], xmm1
                movlps  qword ptr [rcx+40], xmm3
                movhps  qword ptr [rcx+48], xmm1
                movhps  qword ptr [rcx+56], xmm3

                add     rcx, 64
                add     r9, 16
                sub     r8, 16
                cmp     r9, r8
                jb      butterfly_loop                  ;unsigned: loop while offsets have not crossed

                ret
vdasm_mpegaudio_polyphase_dctinputbutterflies endp

;---------------------------------------------------------------------------
; vdasm_mpegaudio_polyphase_dct4x8
;
; In-place transform of eight rows of four floats: s[0..7] at [rcx+k*16].
; Each SSE lane carries an independent 8-point DCT, so four DCTs are done in
; parallel.  See the FPU version to get an idea of the flow of this AAN
; implementation.
;
; In:    rcx = pointer to 8 x 4 floats, 16-byte aligned (accessed with movaps)
; Out:   the same 128 bytes, overwritten with the scaled outputs
; Clobb: xmm0-xmm5, flags
; Saved: xmm6, xmm7 (full 128 bits, spilled to a local frame with unwind
;        info).  Earlier revisions parked only their low halves in the upper
;        halves of xmm14/xmm15, which lost the upper 64 bits of xmm6/xmm7 and
;        corrupted xmm14/xmm15 -- all four are non-volatile in this ABI.
;---------------------------------------------------------------------------
vdasm_mpegaudio_polyphase_dct4x8 proc public frame
                sub     rsp, XMMSAVE_FRAME_SIZE
                .allocstack XMMSAVE_FRAME_SIZE
                movaps  xmmword ptr [rsp+XMMSAVE_XMM6], xmm6
                .savexmm128 xmm6, XMMSAVE_XMM6
                movaps  xmmword ptr [rsp+XMMSAVE_XMM7], xmm7
                .savexmm128 xmm7, XMMSAVE_XMM7
                .endprolog

                ;even part - B3 (4a)
                movaps  xmm0, [rcx+0*16]                ;xmm0 = s[0]
                movaps  xmm1, [rcx+1*16]                ;xmm1 = s[1]
                movaps  xmm2, [rcx+2*16]                ;xmm2 = s[2]
                movaps  xmm3, [rcx+3*16]                ;xmm3 = s[3]
                addps   xmm0, [rcx+7*16]                ;xmm0 = s[0]+s[7]
                addps   xmm1, [rcx+6*16]                ;xmm1 = s[1]+s[6]
                addps   xmm2, [rcx+5*16]                ;xmm2 = s[2]+s[5]
                addps   xmm3, [rcx+4*16]                ;xmm3 = s[3]+s[4]

                ;even part - B2/~B1a (4a)
                movaps  xmm4, xmm0
                addps   xmm0, xmm3                      ;xmm0 = b2[0] = b3[0]+b3[3]
                movaps  xmm5, xmm1
                addps   xmm1, xmm2                      ;xmm1 = b2[1] = b3[1]+b3[2]
                subps   xmm4, xmm3                      ;xmm4 = b2[2] = b3[0]-b3[3]
                subps   xmm5, xmm2                      ;xmm5 = b2[3] = b3[1]-b3[2]

                ;even part - ~B1b/M (3a1m)
                movaps  xmm2, xmm0
                subps   xmm4, xmm5
                addps   xmm0, xmm1                      ;xmm0 = m[0] = b2[0] + b2[1]
                mulps   xmm4, c4                        ;xmm4 = m[2] = (b2[2] - b2[3])*c4
                subps   xmm2, xmm1                      ;xmm2 = m[1] = b2[0] - b2[1]

                ;even part - R1 (2a)
                movaps  xmm3, xmm4
                subps   xmm4, xmm5                      ;xmm4 = r1[3] = m[2]-m[3]
                addps   xmm3, xmm5                      ;xmm3 = r1[2] = m[2]+m[3]

                ;even part - d (4m)
                mulps   xmm0, [d+0*16]                  ;xmm0 = out[0] = r1[0]*d[0]
                mulps   xmm2, [d+4*16]                  ;xmm2 = out[4] = r1[1]*d[4]
                mulps   xmm3, [d+2*16]                  ;xmm3 = out[2] = r1[2]*d[2]
                mulps   xmm4, [d+6*16]                  ;xmm4 = out[6] = r1[3]*d[6]

                ;odd part - B3 (4a)
                ;(inputs are re-read here, before the even outputs overwrite them)
                movaps  xmm1, [rcx+0*16]
                movaps  xmm5, [rcx+1*16]
                movaps  xmm6, [rcx+2*16]
                movaps  xmm7, [rcx+3*16]
                subps   xmm1, [rcx+7*16]                ;xmm1 = b3[4] = s[0]-s[7]
                subps   xmm5, [rcx+6*16]                ;xmm5 = b3[5] = s[1]-s[6]
                subps   xmm6, [rcx+5*16]                ;xmm6 = b3[6] = s[2]-s[5]
                subps   xmm7, [rcx+4*16]                ;xmm7 = b3[7] = s[3]-s[4]

                ;even part - writeout
                movaps  [rcx+0*16], xmm0
                movaps  [rcx+4*16], xmm2
                movaps  [rcx+2*16], xmm3
                movaps  [rcx+6*16], xmm4

                ;odd part - B2/~B1a (3a)
                addps   xmm5, xmm7                      ;xmm5 = b2[5] = b3[5]+b3[7]
                subps   xmm7, xmm1                      ;xmm7 = b2[7] = b3[7]-b3[4]
                subps   xmm1, xmm6                      ;xmm1 = b2[4] = b3[4]-b3[6]

                ;odd part - ~B1b/M (2a5m)
                movaps  xmm0, xmm1
                mulps   xmm7, c4                        ;xmm7 = m[7] = c4*b2[7]
                movaps  xmm2, xmm5
                mulps   xmm0, c6
                mulps   xmm1, c2
                mulps   xmm2, c2
                mulps   xmm5, c6
                addps   xmm0, xmm2                      ;xmm0 = m[4] = c6*b2[4] + c2*b2[5]
                subps   xmm1, xmm5                      ;xmm1 = m[5] = c2*b2[4] - c6*b2[5]

                ;odd part - R1a (2a)
                movaps  xmm5, xmm6
                addps   xmm6, xmm7                      ;xmm6 = r1a[6] = m[6]+m[7]
                subps   xmm5, xmm7                      ;xmm5 = r1a[7] = m[6]-m[7]

                ;odd part - R1b (4a)
                movaps  xmm3, xmm5
                movaps  xmm4, xmm6
                subps   xmm5, xmm0                      ;xmm5 = r1b[7] = r1a[7]-r1a[4]
                subps   xmm6, xmm1                      ;xmm6 = r1b[6] = r1a[6]-r1a[5]
                addps   xmm4, xmm1                      ;xmm4 = r1b[5] = r1a[6]+r1a[5]
                addps   xmm3, xmm0                      ;xmm3 = r1b[4] = r1a[7]+r1a[4]

                ;odd part - D (4m)
                mulps   xmm3, [d+1*16]
                mulps   xmm4, [d+5*16]
                mulps   xmm6, [d+3*16]
                mulps   xmm5, [d+7*16]

                ;odd part - writeout
                movaps  [rcx+1*16], xmm3
                movaps  [rcx+5*16], xmm4
                movaps  [rcx+3*16], xmm6
                movaps  [rcx+7*16], xmm5

                ;restore non-volatile registers and release the frame
                movaps  xmm6, xmmword ptr [rsp+XMMSAVE_XMM6]
                movaps  xmm7, xmmword ptr [rsp+XMMSAVE_XMM7]
                add     rsp, XMMSAVE_FRAME_SIZE
                ret
vdasm_mpegaudio_polyphase_dct4x8 endp

;---------------------------------------------------------------------------
;void vdasm_mpegaudio_polyphase_matrixout_stereo(const float (*pSrc)[16], const float *pWinFwd, const float *pWinRev, int inc, const uint32 *pSampleInv, const sint16 *pDst, const float (*pSrcFinal)[16], const uint32 *pFinalMask);
;
; Windows two channels of subband samples and writes 32 stereo pairs of
; saturated 16-bit samples (one dword per pair).
;
; In:    rcx        = pSrc       subband samples; left at +0, right at +64;
;                                16-byte aligned (used as aligned operands)
;        rdx        = pWinFwd    forward window (unaligned loads)
;        r8         = pWinRev    reverse window (unaligned loads)
;        r9d        = inc        source increment in bytes, sign-extended
;        stack arg5 = pSampleInv 16-byte aligned xor mask applied before
;                                conversion
;        stack arg6 = pDst       output pointer; dwords are written from
;                                [pDst-4] through [pDst+120]
;        stack arg7 = pSrcFinal  subband samples for the last sample (16)
;        stack arg8 = pFinalMask 16-byte aligned and-mask for the last sample
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
; Saved: xmm6, xmm7 (full 128 bits, spilled to a local frame with unwind
;        info; xmm14/xmm15 are no longer used as scratch storage).
;---------------------------------------------------------------------------
vdasm_mpegaudio_polyphase_matrixout_stereo proc public frame
                sub     rsp, XMMSAVE_FRAME_SIZE
                .allocstack XMMSAVE_FRAME_SIZE
                movaps  xmmword ptr [rsp+XMMSAVE_XMM6], xmm6
                .savexmm128 xmm6, XMMSAVE_XMM6
                movaps  xmmword ptr [rsp+XMMSAVE_XMM7], xmm7
                .savexmm128 xmm7, XMMSAVE_XMM7
                .endprolog

                ;rcx = pointer to subband samples
                ;rdx = pointer to forward window
                ;r8  = pointer to reverse window
                ;r9  = source increment

                movsxd  r9, r9d
                mov     r10, [rsp+MATRIXOUT_PSAMPLEINV] ;r10 = pointer to sample inversion value
                mov     r11, [rsp+MATRIXOUT_PDST]       ;r11 = pointer to first two forward destination samples
                lea     rax, [r11+120]                  ;rax = pointer to first two reverse destination samples

                ;compute first sample (0)

                movaps  xmm5, xmmword ptr invother
                movups  xmm0, [rdx]                     ;load window samples 0-3
                xorps   xmm0, xmm5                      ;toggle signs on odd window samples
                movaps  xmm1, xmm0
                mulps   xmm0, [rcx]                     ;multiply by left subband samples
                mulps   xmm1, [rcx+64]                  ;multiply by right subband samples

                movups  xmm2, [rdx+16]                  ;load window samples 4-7
                xorps   xmm2, xmm5                      ;toggle signs on odd window samples
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+16]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+80]                  ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                movups  xmm2, [rdx+32]                  ;load window samples 8-11
                xorps   xmm2, xmm5                      ;toggle signs on odd window samples
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+32]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+96]                  ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                movups  xmm2, [rdx+48]                  ;load window samples 12-15
                xorps   xmm2, xmm5                      ;toggle signs on odd window samples
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+48]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+112]                 ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                ;horizontal add of the four partial sums per channel
                movaps  xmm2, xmm0                      ;xmm2 = l3 | l2 | l1 | l0
                movlhps xmm0, xmm1                      ;xmm0 = r1 | r0 | l1 | l0
                movhlps xmm1, xmm2                      ;xmm1 = r3 | r2 | l3 | l2
                addps   xmm0, xmm1                      ;xmm0 = r1+r3 | r0+r2 | l1+l3 | l0+l2
                shufps  xmm0, xmm0, 11011000b           ;xmm0 = r1+r3 | l1+l3 | r0+r2 | l0+l2
                movhlps xmm3, xmm0                      ;xmm3 = ? | ? | r1+r3 | l1+l3
                movaps  xmm4, [r10]
                movhlps xmm4, xmm4                      ;use the upper half of the inversion mask
                addps   xmm0, xmm3                      ;xmm0 = ? | ? | r | l
                xorps   xmm0, xmm4
                cvtps2dq xmm0, xmm0
                packssdw xmm0, xmm0                     ;saturate to signed 16-bit
                movd    dword ptr [r11-4], xmm0

                add     rdx, 128
                add     r8, 128
                add     rcx, r9

                ;compute reflected samples (1-15, 17-31)
                ;"f" sums use the lane-reversed window from r8 and are stored at [rax];
                ;"b" sums use the sign-toggled window from rdx and are stored at [r11].
reflect_loop:
                movups  xmm2, [r8+48]
                shufps  xmm2, xmm2, 00011011b           ;xmm2 = reverse window
                movups  xmm3, [rdx]                     ;xmm3 = forward window
                xorps   xmm3, invother                  ;negate every other sample in forward window
                movaps  xmm0, [rcx]                     ;xmm0 = left source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                movaps  xmm4, xmm0                      ;xmm4 = left forward
                movaps  xmm5, xmm1                      ;xmm5 = left reverse
                movaps  xmm0, [rcx+64]                  ;xmm0 = right source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                movaps  xmm6, xmm0                      ;xmm6 = right forward
                movaps  xmm7, xmm1                      ;xmm7 = right reverse

                movups  xmm2, [r8+32]
                shufps  xmm2, xmm2, 00011011b           ;xmm2 = reverse window
                movups  xmm3, [rdx+16]                  ;xmm3 = forward window
                xorps   xmm3, invother                  ;negate every other sample in forward window
                movaps  xmm0, [rcx+16]                  ;xmm0 = left source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm4, xmm0                      ;xmm4 += left forward
                addps   xmm5, xmm1                      ;xmm5 += left reverse
                movaps  xmm0, [rcx+80]                  ;xmm0 = right source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm6, xmm0                      ;xmm6 += right forward
                addps   xmm7, xmm1                      ;xmm7 += right reverse

                movups  xmm2, [r8+16]
                shufps  xmm2, xmm2, 00011011b           ;xmm2 = reverse window
                movups  xmm3, [rdx+32]                  ;xmm3 = forward window
                xorps   xmm3, invother                  ;negate every other sample in forward window
                movaps  xmm0, [rcx+32]                  ;xmm0 = left source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm4, xmm0                      ;xmm4 += left forward
                addps   xmm5, xmm1                      ;xmm5 += left reverse
                movaps  xmm0, [rcx+96]                  ;xmm0 = right source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm6, xmm0                      ;xmm6 += right forward
                addps   xmm7, xmm1                      ;xmm7 += right reverse

                movups  xmm2, [r8]
                shufps  xmm2, xmm2, 00011011b           ;xmm2 = reverse window
                movups  xmm3, [rdx+48]                  ;xmm3 = forward window
                xorps   xmm3, invother                  ;negate every other sample in forward window
                movaps  xmm0, [rcx+48]                  ;xmm0 = left source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm4, xmm0                      ;xmm4 += left forward
                addps   xmm5, xmm1                      ;xmm5 += left reverse
                movaps  xmm0, [rcx+112]                 ;xmm0 = right source
                movaps  xmm1, xmm0
                mulps   xmm0, xmm2
                mulps   xmm1, xmm3
                addps   xmm6, xmm0                      ;xmm6 += right forward
                addps   xmm7, xmm1                      ;xmm7 += right reverse

                ;horizontal add of all four accumulators
                movaps  xmm0, xmm4                      ;xmm0 = lf3 | lf2 | lf1 | lf0
                movaps  xmm1, xmm5
                movlhps xmm0, xmm6                      ;xmm0 = rf1 | rf0 | lf1 | lf0
                movlhps xmm1, xmm7
                movhlps xmm6, xmm4                      ;xmm6 = rf3 | rf2 | lf3 | lf2
                movhlps xmm7, xmm5
                addps   xmm0, xmm6                      ;xmm0 = rf1+rf3 | rf0+rf2 | lf1+lf3 | lf0+lf2
                addps   xmm1, xmm7
                movaps  xmm2, xmm0
                movaps  xmm3, xmm1
                shufps  xmm0, xmm0, 10110001b           ;xmm0 = rf0+rf2 | rf1+rf3 | lf0+lf2 | lf1+lf3
                shufps  xmm1, xmm1, 10110001b
                addps   xmm0, xmm2                      ;xmm0 = rf | rf | lf | lf
                addps   xmm1, xmm3                      ;xmm1 = rb | rb | lb | lb
                shufps  xmm0, xmm1, 10001000b           ;xmm0 = rb | lb | rf | lf
                xorps   xmm0, [r10]
                cvtps2dq xmm0, xmm0
                packssdw xmm0, xmm0                     ;saturate to signed 16-bit
                movd    dword ptr [rax], xmm0           ;store lf/rf pair
                psrldq  xmm0, 4
                movd    dword ptr [r11], xmm0           ;store lb/rb pair

                add     r11, 4
                sub     rax, 4
                add     rcx, r9
                add     rdx, 128
                add     r8, 128
                cmp     r11, rax
                jne     reflect_loop                    ;15 iterations: the pointers meet in the middle

                ;do last sample (16)

                mov     rcx, [rsp+MATRIXOUT_PSRCFINAL]
                mov     rax, [rsp+MATRIXOUT_PFINALMASK]
                movaps  xmm5, [rax]                     ;load final mask (masks out every other sample)
                movups  xmm0, [rdx]                     ;load window samples 0-3
                andps   xmm0, xmm5                      ;mask out every other sample
                movaps  xmm1, xmm0
                mulps   xmm0, [rcx]                     ;multiply by left subband samples
                mulps   xmm1, [rcx+64]                  ;multiply by right subband samples

                movups  xmm2, [rdx+16]                  ;load window samples 4-7
                andps   xmm2, xmm5                      ;mask out every other sample
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+16]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+80]                  ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                movups  xmm2, [rdx+32]                  ;load window samples 8-11
                andps   xmm2, xmm5                      ;mask out every other sample
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+32]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+96]                  ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                movups  xmm2, [rdx+48]                  ;load window samples 12-15
                andps   xmm2, xmm5                      ;mask out every other sample
                movaps  xmm3, xmm2
                mulps   xmm2, [rcx+48]                  ;multiply by left subband samples
                mulps   xmm3, [rcx+112]                 ;multiply by right subband samples
                addps   xmm0, xmm2
                addps   xmm1, xmm3

                movaps  xmm2, xmm0                      ;xmm2 = l3 | l2 | l1 | l0
                movlhps xmm0, xmm1                      ;xmm0 = r1 | r0 | l1 | l0
                movhlps xmm1, xmm2                      ;xmm1 = r3 | r2 | l3 | l2
                addps   xmm0, xmm1                      ;xmm0 = r1+r3 | r0+r2 | l1+l3 | l0+l2
                shufps  xmm0, xmm0, 11011000b           ;xmm0 = r1+r3 | l1+l3 | r0+r2 | l0+l2
                movhlps xmm3, xmm0                      ;xmm3 = ? | ? | r1+r3 | l1+l3
                addps   xmm0, xmm3                      ;xmm0 = ? | ? | r | l
                xorps   xmm0, invall                    ;negate both channels
                cvtps2dq xmm0, xmm0
                packssdw xmm0, xmm0                     ;saturate to signed 16-bit
                movd    dword ptr [r11], xmm0

                ;restore non-volatile registers and release the frame
                movaps  xmm6, xmmword ptr [rsp+XMMSAVE_XMM6]
                movaps  xmm7, xmmword ptr [rsp+XMMSAVE_XMM7]
                add     rsp, XMMSAVE_FRAME_SIZE
                ret
vdasm_mpegaudio_polyphase_matrixout_stereo endp

                end